{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Using TensorFlow backend.\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd #导入Pandas\n",
    "import numpy as np #导入Numpy\n",
    "import jieba #导入结巴分词\n",
    "from keras.preprocessing import sequence\n",
    "from keras.optimizers import SGD, RMSprop, Adagrad\n",
    "from keras.utils import np_utils\n",
    "from keras.models import Sequential\n",
    "from keras.layers.core import Dense, Dropout, Activation\n",
    "from keras.layers.embeddings import Embedding\n",
    "from keras.layers.recurrent import LSTM, GRU\n",
    "import warnings\n",
    "warnings.filterwarnings(\"ignore\")\n",
    "\n",
    "import re\n",
    "from collections import Counter, defaultdict\n",
    "from sklearn.model_selection import StratifiedKFold, KFold\n",
    "from sklearn import metrics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "def build_dataset(words, vocabulary_size = 5000):\n",
    "    from collections import Counter\n",
    "    count = [['UNK', -1]]\n",
    "    count.extend(Counter(words).most_common(vocabulary_size - 1))\n",
    "    w_dictionary = {}\n",
    "    for word, _ in count:\n",
    "        w_dictionary[word] = len(w_dictionary)\n",
    "    da = list()\n",
    "    unk_count = 0\n",
    "    for word in words:\n",
    "        if word in w_dictionary:\n",
    "            index = w_dictionary[word]\n",
    "        else:\n",
    "            index = 0\n",
    "            unk_count += 1\n",
    "        da.append(index)\n",
    "    count[0][1] = unk_count\n",
    "    reverse_dictionary = {zip(w_dictionary.values(), w_dictionary.keys())}\n",
    "    return da, count, w_dictionary, reverse_dictionary\n",
    "\n",
    "def rmsel(true_label,pred):\n",
    "    \"\"\"Return the root-mean-squared error r between labels and predictions, rescaled as r / (1 + r).\"\"\"\n",
    "    root_mse = np.sqrt(metrics.mean_squared_error(true_label, pred))\n",
    "    return root_mse / (1 + root_mse)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Id</th>\n",
       "      <th>Discuss</th>\n",
       "      <th>Score</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>201e8bf2-77a2-3a98-9fcf-4ce03914e712</td>\n",
       "      <td>好大的一个游乐公园，已经去了2次，但感觉还没有玩够似的！会有第三，第四次的</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>f4d51947-eac4-3005-9d3c-2f32d6068a2d</td>\n",
       "      <td>新中国成立也是在这举行，对我们中国人来说有些重要及深刻的意义！</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>74aa7ae4-03a4-394c-bee0-5702d3a3082a</td>\n",
       "      <td>庐山瀑布非常有名，也有非常多个瀑布，只是最好看的非三叠泉莫属，推荐一去</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>099661c2-4360-3c49-a2fe-8c783764f7db</td>\n",
       "      <td>个人觉得颐和园是北京最值的一起的地方，不过相比下门票也是最贵的，比起故宫的雄伟与气势磅礴，颐...</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>97ca672d-e558-3542-ba7b-ee719bba1bab</td>\n",
       "      <td>迪斯尼一日游</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                     Id  \\\n",
       "0  201e8bf2-77a2-3a98-9fcf-4ce03914e712   \n",
       "1  f4d51947-eac4-3005-9d3c-2f32d6068a2d   \n",
       "2  74aa7ae4-03a4-394c-bee0-5702d3a3082a   \n",
       "3  099661c2-4360-3c49-a2fe-8c783764f7db   \n",
       "4  97ca672d-e558-3542-ba7b-ee719bba1bab   \n",
       "\n",
       "                                             Discuss  Score  \n",
       "0              好大的一个游乐公园，已经去了2次，但感觉还没有玩够似的！会有第三，第四次的      5  \n",
       "1                    新中国成立也是在这举行，对我们中国人来说有些重要及深刻的意义！      4  \n",
       "2                庐山瀑布非常有名，也有非常多个瀑布，只是最好看的非三叠泉莫属，推荐一去      4  \n",
       "3  个人觉得颐和园是北京最值的一起的地方，不过相比下门票也是最贵的，比起故宫的雄伟与气势磅礴，颐...      5  \n",
       "4                                             迪斯尼一日游      5  "
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Load the labelled reviews and the unlabelled reviews that have to be scored.\n",
    "train = pd.read_csv('../input/train_first.csv')\n",
    "predict = pd.read_csv('../input/predict_first.csv')\n",
    "# Sentinel label: Score == -1 marks the rows that come from the prediction file.\n",
    "predict['Score'] = -1\n",
    "\n",
    "# Stack both sets so the text preprocessing is applied to them together.\n",
    "data = pd.concat([train, predict])\n",
    "data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": true,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "# Load the stop-word list, one entry per line.\n",
    "stop_words_path = '../input/stop_word.txt'\n",
    "with open(stop_words_path,encoding='utf-8') as f:\n",
    "    stop_word = [line.strip() for line in f]\n",
    "stop_word.append(' ')\n",
    "# Set view of the list: clean_str tests every token for membership, which is\n",
    "# a linear scan on a list but a constant-time lookup on a set.\n",
    "stop_word_set = frozenset(stop_word)\n",
    "\n",
    "def clean_str(stri):\n",
    "    \"\"\"Tokenize one review and drop its stop words.\n",
    "\n",
    "    Runs of ASCII letters and digits are removed first, the stripped\n",
    "    remainder is segmented with jieba, and every token that appears in the\n",
    "    stop-word list is discarded.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    stri : str\n",
    "        Raw review text.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    list of str\n",
    "        The remaining tokens, in their original order.\n",
    "    \"\"\"\n",
    "    stri = re.sub(r'[a-zA-Z0-9]+','',stri)\n",
    "    cut_str = jieba.cut(stri.strip())\n",
    "    return [word for word in cut_str if word not in stop_word_set]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Building prefix dict from the default dictionary ...\n",
      "Loading model from cache C:\\Users\\ADMINI~1\\AppData\\Local\\Temp\\jieba.cache\n",
      "Loading model cost 1.342 seconds.\n",
      "Prefix dict has been built succesfully.\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Id</th>\n",
       "      <th>Discuss</th>\n",
       "      <th>Score</th>\n",
       "      <th>words</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>201e8bf2-77a2-3a98-9fcf-4ce03914e712</td>\n",
       "      <td>好大的一个游乐公园，已经去了2次，但感觉还没有玩够似的！会有第三，第四次的</td>\n",
       "      <td>5</td>\n",
       "      <td>[好大, 一个, 游乐, 公园, 已经, 次, 感觉, 玩够, 第三, 第四次]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>f4d51947-eac4-3005-9d3c-2f32d6068a2d</td>\n",
       "      <td>新中国成立也是在这举行，对我们中国人来说有些重要及深刻的意义！</td>\n",
       "      <td>4</td>\n",
       "      <td>[新, 中国, 成立, 举行, 中国, 人, 来说, 重要, 深刻, 意义]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>74aa7ae4-03a4-394c-bee0-5702d3a3082a</td>\n",
       "      <td>庐山瀑布非常有名，也有非常多个瀑布，只是最好看的非三叠泉莫属，推荐一去</td>\n",
       "      <td>4</td>\n",
       "      <td>[庐山, 瀑布, 有名, 多个, 瀑布, 最, 好看, 非, 三叠, 泉莫属, 推荐, 一去]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>099661c2-4360-3c49-a2fe-8c783764f7db</td>\n",
       "      <td>个人觉得颐和园是北京最值的一起的地方，不过相比下门票也是最贵的，比起故宫的雄伟与气势磅礴，颐...</td>\n",
       "      <td>5</td>\n",
       "      <td>[觉得, 颐和园, 北京, 最值, 一起, 地方, 相比, 下, 门票, 最贵, 故宫, 雄...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>97ca672d-e558-3542-ba7b-ee719bba1bab</td>\n",
       "      <td>迪斯尼一日游</td>\n",
       "      <td>5</td>\n",
       "      <td>[迪斯尼, 一日游]</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                     Id  \\\n",
       "0  201e8bf2-77a2-3a98-9fcf-4ce03914e712   \n",
       "1  f4d51947-eac4-3005-9d3c-2f32d6068a2d   \n",
       "2  74aa7ae4-03a4-394c-bee0-5702d3a3082a   \n",
       "3  099661c2-4360-3c49-a2fe-8c783764f7db   \n",
       "4  97ca672d-e558-3542-ba7b-ee719bba1bab   \n",
       "\n",
       "                                             Discuss  Score  \\\n",
       "0              好大的一个游乐公园，已经去了2次，但感觉还没有玩够似的！会有第三，第四次的      5   \n",
       "1                    新中国成立也是在这举行，对我们中国人来说有些重要及深刻的意义！      4   \n",
       "2                庐山瀑布非常有名，也有非常多个瀑布，只是最好看的非三叠泉莫属，推荐一去      4   \n",
       "3  个人觉得颐和园是北京最值的一起的地方，不过相比下门票也是最贵的，比起故宫的雄伟与气势磅礴，颐...      5   \n",
       "4                                             迪斯尼一日游      5   \n",
       "\n",
       "                                               words  \n",
       "0           [好大, 一个, 游乐, 公园, 已经, 次, 感觉, 玩够, 第三, 第四次]  \n",
       "1             [新, 中国, 成立, 举行, 中国, 人, 来说, 重要, 深刻, 意义]  \n",
       "2    [庐山, 瀑布, 有名, 多个, 瀑布, 最, 好看, 非, 三叠, 泉莫属, 推荐, 一去]  \n",
       "3  [觉得, 颐和园, 北京, 最值, 一起, 地方, 相比, 下, 门票, 最贵, 故宫, 雄...  \n",
       "4                                         [迪斯尼, 一日游]  "
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Segment every review into its list of non-stop-word tokens.\n",
    "data['words'] = data['Discuss'].apply(clean_str)\n",
    "data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[0. 0. 0. 0. 0. 1.]\n",
      " [0. 0. 0. 0. 1. 0.]\n",
      " [0. 0. 0. 0. 1. 0.]\n",
      " ...\n",
      " [0. 0. 0. 0. 0. 1.]\n",
      " [0. 0. 0. 0. 1. 0.]\n",
      " [0. 0. 0. 0. 0. 1.]]\n"
     ]
    }
   ],
   "source": [
    "# Recast the regression target as a multi-class target: one-hot encode the training scores\n",
    "y = np_utils.to_categorical(train['Score'])\n",
    "print(y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0             [好大, 一个, 游乐, 公园, 已经, 次, 感觉, 玩够, 第三, 第四次]\n",
       "1               [新, 中国, 成立, 举行, 中国, 人, 来说, 重要, 深刻, 意义]\n",
       "2      [庐山, 瀑布, 有名, 多个, 瀑布, 最, 好看, 非, 三叠, 泉莫属, 推荐, 一去]\n",
       "3    [觉得, 颐和园, 北京, 最值, 一起, 地方, 相比, 下, 门票, 最贵, 故宫, 雄...\n",
       "4                                           [迪斯尼, 一日游]\n",
       "Name: words, dtype: object"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Copy of the tokenized-review column (training and prediction rows together).\n",
    "d2v_train = data['words'].copy()\n",
    "d2v_train.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['好大', '一个', '游乐', '公园', '已经', '次', '感觉', '玩够', '第三', '第四次', '新', '中国', '成立', '举行', '中国', '人', '来说', '重要', '深刻', '意义', '庐山', '瀑布', '有名', '多个', '瀑布', '最', '好看', '非', '三叠', '泉莫属', '推荐', '一去', '觉得', '颐和园', '北京', '最值', '一起', '地方', '相比', '下', '门票', '最贵', '故宫', '雄伟', '气势磅礴', '颐和园', '宁静', '波光粼粼', '美', '迪斯尼', '一日游', '方便', '看水', '看山', '感受', '古人', '智慧结晶', '秋景', '美丽', '如画', '红黄绿', '相间', '身体', '状况不佳', '人', '来说', '走平路', '接受', '赞', '唯一', '糟点', '周未', '周边游', '景点', '服务', '不错', '排队', '太长', '好玩', '项目', '人', '晚上', '烟火', '一定', '真的', '不错', '做好', '攻', '绍兴', '护城河', '夜游', '感觉', '不错', '一日游', '不错', '选择', '有趣', '荡气回肠', '年', '留下来']\n"
     ]
    }
   ],
   "source": [
    "# Flatten the per-review token lists into one corpus-wide token list.\n",
    "all_words = []\n",
    "for i in d2v_train:\n",
    "    all_words += i\n",
    "print(all_words[:100])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[['UNK', 0], ('好', 19050), ('不错', 17257), ('人', 15168), ('一个', 12281), ('地方', 12238), ('景区', 12129), ('值得', 9652), ('景点', 8966), ('感觉', 8526), ('风景', 8448), ('走', 7870), ('比较', 7247), ('美', 6662), ('里面', 6581), ('门票', 6463), ('景色', 6431), ('真的', 6185), ('看到', 5918), ('说', 5553), ('时间', 5227), ('北京', 5084), ('玩', 5045), ('特别', 4888), ('西湖', 4781), ('方便', 4780), ('最', 4778), ('元', 4773), ('中', 4715), ('喜欢', 4635), ('看看', 4624), ('挺', 4623), ('坐', 4209), ('建筑', 4204), ('中国', 4155), ('小', 4141), ('下', 4133), ('一定', 4020), ('再', 3988), ('历史', 3819), ('爬', 3815), ('小时', 3790), ('一去', 3756), ('里', 3754), ('觉得', 3675), (',', 3653), ('旅游', 3590), ('公园', 3570), ('建议', 3551), ('后', 3473), ('游客', 3455), ('适合', 3426), ('黄山', 3413), ('太', 3394), ('买', 3359), ('有点', 3338), ('特色', 3304), ('年', 3292), ('晚上', 3289), ('吃', 3279), ('位于', 3222), ('很大', 3211), ('想', 3115), ('推荐', 3104), ('一下', 3061), ('一次', 3051), ('现在', 2897), ('排队', 2859), ('游玩', 2835), ('需要', 2673), ('杭州', 2612), ('导游', 2537), ('索道', 2527), ('文化', 2507), ('已经', 2501), ('高', 2496), ('好玩', 2495), ('环境', 2474), ('游', 2419), ('壮观', 2397), ('主要', 2388), ('山', 2340), ('世界', 2330), ('爬山', 2302), ('天气', 2288), ('山上', 2283), ('东西', 2256), ('点', 2251), ('路', 2249), ('感受', 2248), ('知道', 2244), ('住', 2231), ('一些', 2222), ('一天', 2196), ('选择', 2161), ('逛', 2155), ('泰山', 2116), ('漂亮', 2114), ('站', 2086), ('朋友', 2065)]\n"
     ]
    }
   ],
   "source": [
    "# Index the whole corpus; the vocabulary cap is set to the corpus length rather than the default 5000.\n",
    "da, count, w_dictionary, reverse_dictionary = build_dataset(all_words, vocabulary_size = len(all_words))\n",
    "print(count[0:100])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>id</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>好</th>\n",
       "      <td>19050</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>不错</th>\n",
       "      <td>17257</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>人</th>\n",
       "      <td>15168</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>一个</th>\n",
       "      <td>12281</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>地方</th>\n",
       "      <td>12238</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "        0  id\n",
       "好   19050   1\n",
       "不错  17257   2\n",
       "人   15168   3\n",
       "一个  12281   4\n",
       "地方  12238   5"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Count the occurrences of every token (value_counts lists the most frequent first).\n",
    "hhh = pd.Series(all_words).value_counts().to_frame()\n",
    "# Number the tokens from 1 in that order.\n",
    "hhh['id'] = list(range(1, len(hhh) + 1))\n",
    "hhh.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Id</th>\n",
       "      <th>Discuss</th>\n",
       "      <th>Score</th>\n",
       "      <th>words</th>\n",
       "      <th>sent</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>201e8bf2-77a2-3a98-9fcf-4ce03914e712</td>\n",
       "      <td>好大的一个游乐公园，已经去了2次，但感觉还没有玩够似的！会有第三，第四次的</td>\n",
       "      <td>5</td>\n",
       "      <td>[好大, 一个, 游乐, 公园, 已经, 次, 感觉, 玩够, 第三, 第四次]</td>\n",
       "      <td>[1500, 4, 872, 47, 74, 1228, 9, 21246, 3196, 1...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>f4d51947-eac4-3005-9d3c-2f32d6068a2d</td>\n",
       "      <td>新中国成立也是在这举行，对我们中国人来说有些重要及深刻的意义！</td>\n",
       "      <td>4</td>\n",
       "      <td>[新, 中国, 成立, 举行, 中国, 人, 来说, 重要, 深刻, 意义]</td>\n",
       "      <td>[408, 34, 3512, 1714, 34, 3, 238, 308, 726, 505]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>74aa7ae4-03a4-394c-bee0-5702d3a3082a</td>\n",
       "      <td>庐山瀑布非常有名，也有非常多个瀑布，只是最好看的非三叠泉莫属，推荐一去</td>\n",
       "      <td>4</td>\n",
       "      <td>[庐山, 瀑布, 有名, 多个, 瀑布, 最, 好看, 非, 三叠, 泉莫属, 推荐, 一去]</td>\n",
       "      <td>[577, 108, 359, 2299, 108, 26, 157, 1143, 2239...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>099661c2-4360-3c49-a2fe-8c783764f7db</td>\n",
       "      <td>个人觉得颐和园是北京最值的一起的地方，不过相比下门票也是最贵的，比起故宫的雄伟与气势磅礴，颐...</td>\n",
       "      <td>5</td>\n",
       "      <td>[觉得, 颐和园, 北京, 最值, 一起, 地方, 相比, 下, 门票, 最贵, 故宫, 雄...</td>\n",
       "      <td>[44, 367, 21, 50896, 102, 5, 873, 36, 15, 7858...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>97ca672d-e558-3542-ba7b-ee719bba1bab</td>\n",
       "      <td>迪斯尼一日游</td>\n",
       "      <td>5</td>\n",
       "      <td>[迪斯尼, 一日游]</td>\n",
       "      <td>[1306, 344]</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                     Id  \\\n",
       "0  201e8bf2-77a2-3a98-9fcf-4ce03914e712   \n",
       "1  f4d51947-eac4-3005-9d3c-2f32d6068a2d   \n",
       "2  74aa7ae4-03a4-394c-bee0-5702d3a3082a   \n",
       "3  099661c2-4360-3c49-a2fe-8c783764f7db   \n",
       "4  97ca672d-e558-3542-ba7b-ee719bba1bab   \n",
       "\n",
       "                                             Discuss  Score  \\\n",
       "0              好大的一个游乐公园，已经去了2次，但感觉还没有玩够似的！会有第三，第四次的      5   \n",
       "1                    新中国成立也是在这举行，对我们中国人来说有些重要及深刻的意义！      4   \n",
       "2                庐山瀑布非常有名，也有非常多个瀑布，只是最好看的非三叠泉莫属，推荐一去      4   \n",
       "3  个人觉得颐和园是北京最值的一起的地方，不过相比下门票也是最贵的，比起故宫的雄伟与气势磅礴，颐...      5   \n",
       "4                                             迪斯尼一日游      5   \n",
       "\n",
       "                                               words  \\\n",
       "0           [好大, 一个, 游乐, 公园, 已经, 次, 感觉, 玩够, 第三, 第四次]   \n",
       "1             [新, 中国, 成立, 举行, 中国, 人, 来说, 重要, 深刻, 意义]   \n",
       "2    [庐山, 瀑布, 有名, 多个, 瀑布, 最, 好看, 非, 三叠, 泉莫属, 推荐, 一去]   \n",
       "3  [觉得, 颐和园, 北京, 最值, 一起, 地方, 相比, 下, 门票, 最贵, 故宫, 雄...   \n",
       "4                                         [迪斯尼, 一日游]   \n",
       "\n",
       "                                                sent  \n",
       "0  [1500, 4, 872, 47, 74, 1228, 9, 21246, 3196, 1...  \n",
       "1   [408, 34, 3512, 1714, 34, 3, 238, 308, 726, 505]  \n",
       "2  [577, 108, 359, 2299, 108, 26, 157, 1143, 2239...  \n",
       "3  [44, 367, 21, 50896, 102, 5, 873, 36, 15, 7858...  \n",
       "4                                        [1306, 344]  "
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "def get_sent(x, dictionary):\n",
    "    encode = []\n",
    "    for i in x:\n",
    "        if i in dictionary: encode.append(dictionary[i])\n",
    "        else: encode.append(0)\n",
    "    return encode\n",
    "    \n",
    "# Turn every token list into the matching list of vocabulary ids.\n",
    "data['sent'] = data['words'].apply(get_sent, args=(w_dictionary,))\n",
    "data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Id</th>\n",
       "      <th>Discuss</th>\n",
       "      <th>Score</th>\n",
       "      <th>words</th>\n",
       "      <th>sent</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>201e8bf2-77a2-3a98-9fcf-4ce03914e712</td>\n",
       "      <td>好大的一个游乐公园，已经去了2次，但感觉还没有玩够似的！会有第三，第四次的</td>\n",
       "      <td>5</td>\n",
       "      <td>[好大, 一个, 游乐, 公园, 已经, 次, 感觉, 玩够, 第三, 第四次]</td>\n",
       "      <td>[1500, 4, 872, 47, 74, 1228, 9, 21246, 3196, 1...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>f4d51947-eac4-3005-9d3c-2f32d6068a2d</td>\n",
       "      <td>新中国成立也是在这举行，对我们中国人来说有些重要及深刻的意义！</td>\n",
       "      <td>4</td>\n",
       "      <td>[新, 中国, 成立, 举行, 中国, 人, 来说, 重要, 深刻, 意义]</td>\n",
       "      <td>[408, 34, 3512, 1714, 34, 3, 238, 308, 726, 505]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>74aa7ae4-03a4-394c-bee0-5702d3a3082a</td>\n",
       "      <td>庐山瀑布非常有名，也有非常多个瀑布，只是最好看的非三叠泉莫属，推荐一去</td>\n",
       "      <td>4</td>\n",
       "      <td>[庐山, 瀑布, 有名, 多个, 瀑布, 最, 好看, 非, 三叠, 泉莫属, 推荐, 一去]</td>\n",
       "      <td>[577, 108, 359, 2299, 108, 26, 157, 1143, 2239...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>099661c2-4360-3c49-a2fe-8c783764f7db</td>\n",
       "      <td>个人觉得颐和园是北京最值的一起的地方，不过相比下门票也是最贵的，比起故宫的雄伟与气势磅礴，颐...</td>\n",
       "      <td>5</td>\n",
       "      <td>[觉得, 颐和园, 北京, 最值, 一起, 地方, 相比, 下, 门票, 最贵, 故宫, 雄...</td>\n",
       "      <td>[44, 367, 21, 50896, 102, 5, 873, 36, 15, 7858...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>97ca672d-e558-3542-ba7b-ee719bba1bab</td>\n",
       "      <td>迪斯尼一日游</td>\n",
       "      <td>5</td>\n",
       "      <td>[迪斯尼, 一日游]</td>\n",
       "      <td>[1306, 344]</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                     Id  \\\n",
       "0  201e8bf2-77a2-3a98-9fcf-4ce03914e712   \n",
       "1  f4d51947-eac4-3005-9d3c-2f32d6068a2d   \n",
       "2  74aa7ae4-03a4-394c-bee0-5702d3a3082a   \n",
       "3  099661c2-4360-3c49-a2fe-8c783764f7db   \n",
       "4  97ca672d-e558-3542-ba7b-ee719bba1bab   \n",
       "\n",
       "                                             Discuss  Score  \\\n",
       "0              好大的一个游乐公园，已经去了2次，但感觉还没有玩够似的！会有第三，第四次的      5   \n",
       "1                    新中国成立也是在这举行，对我们中国人来说有些重要及深刻的意义！      4   \n",
       "2                庐山瀑布非常有名，也有非常多个瀑布，只是最好看的非三叠泉莫属，推荐一去      4   \n",
       "3  个人觉得颐和园是北京最值的一起的地方，不过相比下门票也是最贵的，比起故宫的雄伟与气势磅礴，颐...      5   \n",
       "4                                             迪斯尼一日游      5   \n",
       "\n",
       "                                               words  \\\n",
       "0           [好大, 一个, 游乐, 公园, 已经, 次, 感觉, 玩够, 第三, 第四次]   \n",
       "1             [新, 中国, 成立, 举行, 中国, 人, 来说, 重要, 深刻, 意义]   \n",
       "2    [庐山, 瀑布, 有名, 多个, 瀑布, 最, 好看, 非, 三叠, 泉莫属, 推荐, 一去]   \n",
       "3  [觉得, 颐和园, 北京, 最值, 一起, 地方, 相比, 下, 门票, 最贵, 故宫, 雄...   \n",
       "4                                         [迪斯尼, 一日游]   \n",
       "\n",
       "                                                sent  \n",
       "0  [1500, 4, 872, 47, 74, 1228, 9, 21246, 3196, 1...  \n",
       "1   [408, 34, 3512, 1714, 34, 3, 238, 308, 726, 505]  \n",
       "2  [577, 108, 359, 2299, 108, 26, 157, 1143, 2239...  \n",
       "3  [44, 367, 21, 50896, 102, 5, 873, 36, 15, 7858...  \n",
       "4                                        [1306, 344]  "
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Split the combined frame back apart using the Score == -1 sentinel.\n",
    "# Each part is an explicit copy, so the column changes made to it later do not\n",
    "# act on a selection of `data` (pandas chained-assignment warning).\n",
    "is_unlabelled = data['Score'] == -1\n",
    "train_df = data[~is_unlabelled].copy()\n",
    "# The placeholder Score carries no information for the rows to be predicted.\n",
    "predict_df = data[is_unlabelled].drop('Score', axis=1)\n",
    "\n",
    "train_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Id</th>\n",
       "      <th>Discuss</th>\n",
       "      <th>words</th>\n",
       "      <th>sent</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>9a1caf96-681e-3c11-b588-43ac742d7fd2</td>\n",
       "      <td>快乐之旅</td>\n",
       "      <td>[快乐, 之旅]</td>\n",
       "      <td>[619, 409]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>82b450db-65c2-351c-84fb-761d76582680</td>\n",
       "      <td>岛上看日落的地方，视野很开阔，非常漂亮</td>\n",
       "      <td>[岛上, 日落, 地方, 视野, 开阔, 漂亮]</td>\n",
       "      <td>[405, 732, 5, 1218, 1022, 97]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2eec4606-590c-3fa2-b846-7f92441c54a6</td>\n",
       "      <td>很有鲁迅风味 很喜欢这样有文化的地方</td>\n",
       "      <td>[鲁迅, 风味, 喜欢, 文化, 地方]</td>\n",
       "      <td>[696, 1196, 29, 73, 5]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>509f9a68-ac41-35ff-9d2e-2fc12f73ed7f</td>\n",
       "      <td>去乌鲁木齐还能不去天山天池吗，哈哈哈～</td>\n",
       "      <td>[乌鲁木齐, 天山, 天池, 哈哈哈]</td>\n",
       "      <td>[7697, 3021, 378, 622]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>395f4b22-1c5f-328a-a19d-5065e0530cbc</td>\n",
       "      <td>非常满意，直接拿身份证刷机入园就行了，不用排队买票，比较节约时间</td>\n",
       "      <td>[满意, 直接, 身份证, 刷机, 入园, 就行了, 不用, 排队, 买票, 比较, 节约,...</td>\n",
       "      <td>[443, 103, 265, 77562, 1112, 2370, 172, 67, 19...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                     Id                           Discuss  \\\n",
       "0  9a1caf96-681e-3c11-b588-43ac742d7fd2                              快乐之旅   \n",
       "1  82b450db-65c2-351c-84fb-761d76582680               岛上看日落的地方，视野很开阔，非常漂亮   \n",
       "2  2eec4606-590c-3fa2-b846-7f92441c54a6                很有鲁迅风味 很喜欢这样有文化的地方   \n",
       "3  509f9a68-ac41-35ff-9d2e-2fc12f73ed7f               去乌鲁木齐还能不去天山天池吗，哈哈哈～   \n",
       "4  395f4b22-1c5f-328a-a19d-5065e0530cbc  非常满意，直接拿身份证刷机入园就行了，不用排队买票，比较节约时间   \n",
       "\n",
       "                                               words  \\\n",
       "0                                           [快乐, 之旅]   \n",
       "1                           [岛上, 日落, 地方, 视野, 开阔, 漂亮]   \n",
       "2                               [鲁迅, 风味, 喜欢, 文化, 地方]   \n",
       "3                                [乌鲁木齐, 天山, 天池, 哈哈哈]   \n",
       "4  [满意, 直接, 身份证, 刷机, 入园, 就行了, 不用, 排队, 买票, 比较, 节约,...   \n",
       "\n",
       "                                                sent  \n",
       "0                                         [619, 409]  \n",
       "1                      [405, 732, 5, 1218, 1022, 97]  \n",
       "2                             [696, 1196, 29, 73, 5]  \n",
       "3                             [7697, 3021, 378, 622]  \n",
       "4  [443, 103, 265, 77562, 1112, 2370, 172, 67, 19...  "
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Preview the unlabelled rows (the Score column has been removed).\n",
    "predict_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Pad sequences (samples x time)\n"
     ]
    }
   ],
   "source": [
    "# Every id sequence is brought to exactly this many entries by pad_sequences.\n",
    "maxlen = 10\n",
    "print(\"Pad sequences (samples x time)\")\n",
    "\n",
    "# list(...) turns the 2-D result into one row-array per review so it fits in a single column.\n",
    "train_df['sent'] = list(sequence.pad_sequences(train_df['sent'], maxlen=maxlen))\n",
    "predict_df['sent'] = list(sequence.pad_sequences(predict_df['sent'], maxlen=maxlen))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Build model...\n",
      "Train on 7 samples, validate on 3 samples\n",
      "Epoch 1/2\n",
      "7/7 [==============================] - 4s 535ms/step - loss: 1.7831 - acc: 0.7143 - val_loss: 1.7458 - val_acc: 0.6667\n",
      "Epoch 2/2\n",
      "7/7 [==============================] - 1s 88ms/step - loss: 1.7349 - acc: 0.8571 - val_loss: 1.7041 - val_acc: 0.6667\n",
      "rmse: 0.6819533514783339\n",
      "Build model...\n",
      "Train on 8 samples, validate on 2 samples\n",
      "Epoch 1/2\n",
      "8/8 [==============================] - 5s 637ms/step - loss: 1.7870 - acc: 0.3750 - val_loss: 1.7425 - val_acc: 0.5000\n",
      "Epoch 2/2\n",
      "8/8 [==============================] - 1s 83ms/step - loss: 1.7295 - acc: 0.7500 - val_loss: 1.6888 - val_acc: 0.5000\n",
      "rmse: 0.6631396020648007\n",
      "Build model...\n",
      "Train on 8 samples, validate on 2 samples\n",
      "Epoch 1/2\n",
      "8/8 [==============================] - 8s 947ms/step - loss: 1.8035 - acc: 0.0000e+00 - val_loss: 1.7932 - val_acc: 0.0000e+00\n",
      "Epoch 2/2\n",
      "8/8 [==============================] - 1s 95ms/step - loss: 1.7366 - acc: 0.8750 - val_loss: 1.7824 - val_acc: 0.0000e+00\n",
      "rmse: 0.6710814729784078\n",
      "Build model...\n",
      "Train on 9 samples, validate on 1 samples\n",
      "Epoch 1/2\n",
      "9/9 [==============================] - 6s 671ms/step - loss: 1.7936 - acc: 0.0000e+00 - val_loss: 1.7194 - val_acc: 1.0000\n",
      "Epoch 2/2\n",
      "9/9 [==============================] - 1s 68ms/step - loss: 1.7569 - acc: 0.6667 - val_loss: 1.6440 - val_acc: 1.0000\n",
      "rmse: 0.705824230022326\n",
      "Build model...\n",
      "Train on 8 samples, validate on 2 samples\n",
      "Epoch 1/2\n",
      "8/8 [==============================] - 4s 537ms/step - loss: 1.7842 - acc: 0.3750 - val_loss: 1.7453 - val_acc: 0.5000\n",
      "Epoch 2/2\n",
      "8/8 [==============================] - 1s 88ms/step - loss: 1.7355 - acc: 0.8750 - val_loss: 1.7029 - val_acc: 0.5000\n",
      "rmse: 0.6605599464491422\n"
     ]
    }
   ],
   "source": [
    "nfolds = 5  # number of stratified cross-validation folds used by training()\n",
    "def training(train_df, train_label, test_df):\n",
    "    \"\"\"Fit one LSTM classifier per CV fold and return out-of-fold and test predictions.\n",
    "\n",
    "    A fresh Embedding -> LSTM -> Dropout -> softmax model is trained on each\n",
    "    of the `nfolds` stratified folds. Class probabilities are collapsed into a\n",
    "    single number per row: the probability-weighted mean of the class values\n",
    "    0..5. Reads the notebook globals `nfolds`, `w_dictionary` and `rmsel`.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    train_df : DataFrame\n",
    "        Must have a 'sent' column of equal-length id sequences.\n",
    "    train_label : Series\n",
    "        Integer scores in 0..5, aligned row by row with `train_df`.\n",
    "    test_df : DataFrame\n",
    "        Must have a 'sent' column; scored by the model of every fold.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    S_train : ndarray, shape (n_train, 1)\n",
    "        Out-of-fold prediction for every training row.\n",
    "    S_test : ndarray, shape (n_test, 1)\n",
    "        Test predictions averaged over the folds.\n",
    "    error : float\n",
    "        Mean of the per-fold rmsel values, rounded to 5 decimals.\n",
    "    \"\"\"\n",
    "    n_classes = 6  # width of the softmax layer: scores 0..5\n",
    "    class_values = np.arange(n_classes)\n",
    "\n",
    "    X = np.array(list(train_df['sent']))\n",
    "    # Pin the one-hot width to the softmax width; otherwise it shrinks whenever\n",
    "    # the highest score is missing from `train_label` and fit() fails on shapes.\n",
    "    y = np_utils.to_categorical(train_label, n_classes)\n",
    "    T = np.array(list(test_df['sent']))\n",
    "    folds = list(StratifiedKFold(n_splits=nfolds, random_state=2018, shuffle=True).split(X, train_label.values))\n",
    "\n",
    "    S_train = np.zeros((X.shape[0], 1))            # n_train x 1 (single model)\n",
    "    S_test_n = np.zeros((T.shape[0], len(folds)))  # n_test x n_folds\n",
    "\n",
    "    error = []\n",
    "    for j, (train_idx, test_idx) in enumerate(folds):\n",
    "        X_train = X[train_idx]    # features of the training part of the fold\n",
    "        y_train = y[train_idx]    # one-hot labels of the training part\n",
    "\n",
    "        X_holdout = X[test_idx]   # held-out inputs to predict\n",
    "        y_holdout = y[test_idx]\n",
    "\n",
    "        print('Build model...')\n",
    "        model = Sequential()\n",
    "        model.add(Embedding(len(w_dictionary) + 1, 256))\n",
    "        model.add(LSTM(256)) # try using a GRU instead, for fun\n",
    "        model.add(Dropout(0.5))\n",
    "        model.add(Dense(n_classes))\n",
    "        model.add(Activation('softmax'))\n",
    "\n",
    "        model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])\n",
    "        # `epochs` is the Keras 2 name of the deprecated `nb_epoch` argument.\n",
    "        model.fit(X_train, y_train, batch_size=32, epochs=2, validation_data=(X_holdout,y_holdout))\n",
    "\n",
    "        y_true = y_holdout.argmax(axis=1)\n",
    "        # Expected score of each row: sum over classes of probability * class value.\n",
    "        y_pred = model.predict(X_holdout, batch_size=32).dot(class_values)\n",
    "        fold_error = rmsel(y_true, y_pred)\n",
    "        print('rmse: {}'.format(fold_error))\n",
    "        error.append(fold_error)\n",
    "\n",
    "        S_train[test_idx] = y_pred.reshape(-1, 1)\n",
    "        S_test_n[:, j] = model.predict(T, batch_size=32).dot(class_values)\n",
    "\n",
    "    S_test = S_test_n.mean(axis=1).reshape(-1, 1)\n",
    "    return S_train, S_test, round(np.mean(error), 5)\n",
    "\n",
    "# Train on every labelled row: S_train is attached to all training Ids afterwards, so it\n",
    "# needs one prediction per row of train_df (slice both arguments, e.g. [0:10], only for a smoke test).\n",
    "S_train, S_test, error = training(train_df, train_df['Score'], predict_df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "error: 0.67651\n"
     ]
    }
   ],
   "source": [
    "# Out-of-fold predictions for the training rows, keyed by Id.\n",
    "# .copy() so the new column is added to an independent frame, not to a selection of train_df.\n",
    "train_out = train_df[['Id']].copy()\n",
    "train_out['lstm_len_10'] = S_train.ravel()\n",
    "train_out.to_csv('../models/__models__/train_lstm_len_10.csv', index = False)\n",
    "\n",
    "# Fold-averaged predictions for the unlabelled rows, keyed by Id.\n",
    "# Double brackets keep a DataFrame: on the Series predict_df['Id'] the assignment\n",
    "# below would set one element labelled 'lstm_len_10' instead of adding a column.\n",
    "test_out = predict_df[['Id']].copy()\n",
    "test_out['lstm_len_10'] = S_test.ravel()\n",
    "\n",
    "test_out.to_csv('../models/__models__/test_lstm_len_10.csv', index = False)\n",
    "\n",
    "print('error: {}'.format(error))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
